#LOADING THE DATASET

# wooldridge bundles the example datasets from Wooldridge's econometrics
# text; `happiness` is the GSS-based dataset used throughout this report.
library(wooldridge)

# Work on a copy so the package's original object stays untouched.
data2<-happiness
# Auto-print a preview of the data frame, then inspect its structure
# (column types, factor levels, and the Stata label attributes).
data2
str(data2)
## 'data.frame':    17137 obs. of  33 variables:
##  $ year       : int  1994 1994 1994 1994 1994 1994 1994 1994 1994 1994 ...
##  $ workstat   : Factor w/ 10 levels "iap","working fulltime",..: 8 2 2 2 3 3 2 2 2 2 ...
##  $ prestige   : int  46 22 29 42 36 43 20 44 42 46 ...
##  $ divorce    : Factor w/ 5 levels "iap","yes","no",..: NA 3 3 2 NA NA 3 NA 3 NA ...
##  $ widowed    : Factor w/ 5 levels "iap","yes","no",..: 1 1 1 1 NA NA NA NA NA 1 ...
##  $ educ       : int  12 12 12 8 13 15 9 12 12 12 ...
##  $ reg16      : Factor w/ 10 levels "foreign","new england",..: 3 1 1 1 3 3 3 1 1 3 ...
##  $ babies     : int  2 0 0 0 0 0 0 0 2 0 ...
##  $ preteen    : int  3 0 0 0 1 0 1 0 1 0 ...
##  $ teens      : int  0 0 0 0 1 0 1 0 0 1 ...
##  $ income     : Factor w/ 16 levels "iap","lt $1000",..: 10 NA 11 11 10 13 12 10 13 11 ...
##  $ region     : Factor w/ 10 levels "not assigned",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ attend     : Factor w/ 10 levels "never","lt once a year",..: 4 8 9 3 3 4 9 9 4 9 ...
##  $ happy      : Factor w/ 6 levels "iap","very happy",..: 3 2 3 4 4 3 4 2 3 3 ...
##  $ owngun     : Factor w/ 6 levels "iap","yes","no",..: NA NA 1 NA 1 1 1 NA 1 1 ...
##  $ tvhours    : int  2 3 1 3 NA 0 10 4 2 NA ...
##  $ vhappy     : int  0 1 0 0 0 0 0 1 0 0 ...
##  $ mothfath16 : int  1 0 0 0 0 0 1 1 0 1 ...
##  $ black      : int  1 1 1 0 1 1 0 1 1 1 ...
##  $ gwbush04   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ female     : int  1 0 1 0 1 1 1 0 1 1 ...
##  $ blackfemale: int  1 0 1 0 1 1 0 0 1 1 ...
##  $ gwbush00   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ occattend  : int  1 0 0 0 0 1 0 0 1 0 ...
##  $ regattend  : int  0 0 1 0 0 0 1 1 0 1 ...
##  $ y94        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ y96        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ y98        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ y00        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ y02        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ y04        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ y06        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ unem10     : int  1 0 NA 1 0 NA NA 0 NA 1 ...
##  - attr(*, "time.stamp")= chr "22 Jan 2013 14:09"
##  - attr(*, "label.table")=List of 15
##   ..$ wrkstat : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr [1:10] "iap" "working fulltime" "working parttime" "temp not working" ...
##   ..$ prestg80: Named num 0
##   .. ..- attr(*, "names")= chr "dk,na,iap"
##   ..$ divorce : Named num [1:5] 0 1 2 8 9
##   .. ..- attr(*, "names")= chr [1:5] "iap" "yes" "no" "dk" ...
##   ..$ widowed : Named num [1:5] 0 1 2 8 9
##   .. ..- attr(*, "names")= chr [1:5] "iap" "yes" "no" "dk" ...
##   ..$ educ    : Named num [1:3] 97 98 99
##   .. ..- attr(*, "names")= chr [1:3] "iap" "dk" "na"
##   ..$ reg16   : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr [1:10] "foreign" "new england" "middle atlantic" "e. nor. central" ...
##   ..$ babies  : Named num [1:2] 8 9
##   .. ..- attr(*, "names")= chr [1:2] "8 or more" "na"
##   ..$ preteen : Named num [1:2] 8 9
##   .. ..- attr(*, "names")= chr [1:2] "8 or more" "na"
##   ..$ teens   : Named num [1:2] 8 9
##   .. ..- attr(*, "names")= chr [1:2] "8 or more" "na"
##   ..$ income  : Named num [1:16] 0 1 2 3 4 5 6 7 8 9 ...
##   .. ..- attr(*, "names")= chr [1:16] "iap" "lt $1000" "$1000 to 2999" "$3000 to 3999" ...
##   ..$ region  : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr [1:10] "not assigned" "new england" "middle atlantic" "e. nor. central" ...
##   ..$ attend  : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr [1:10] "never" "lt once a year" "once a year" "sevrl times a yr" ...
##   ..$ happy   : Named num [1:6] 0 1 2 3 8 9
##   .. ..- attr(*, "names")= chr [1:6] "iap" "very happy" "pretty happy" "not too happy" ...
##   ..$ owngun  : Named num [1:6] 0 1 2 3 8 9
##   .. ..- attr(*, "names")= chr [1:6] "iap" "yes" "no" "refused" ...
##   ..$ tvhours : Named num [1:3] -1 98 99
##   .. ..- attr(*, "names")= chr [1:3] "iap" "dk" "na"

The given dataset has 17137 observations and 33 variables

# Five-number summaries for numeric columns and level counts for factors;
# the NA's rows expose per-column missingness.
summary(data2)
##       year                  workstat       prestige     divorce    
##  Min.   :1994   working fulltime:9214   Min.   :17.00   iap :   0  
##  1st Qu.:1996   retired         :2459   1st Qu.:33.00   yes :2333  
##  Median :1998   keeping house   :1912   Median :43.00   no  :7421  
##  Mean   :1999   working parttime:1812   Mean   :43.88   dk  :   0  
##  3rd Qu.:2004   school          : 521   3rd Qu.:51.00   na  :   0  
##  Max.   :2006   (Other)         :1216   Max.   :86.00   NA's:7383  
##                 NA's            :   3   NA's   :854                
##  widowed           educ                   reg16          babies      
##  iap :10718   Min.   : 0.00   e. nor. central:3148   Min.   :0.0000  
##  yes :  378   1st Qu.:12.00   south atlantic :2643   1st Qu.:0.0000  
##  no  :    0   Median :13.00   middle atlantic:2620   Median :0.0000  
##  dk  :    0   Mean   :13.32   pacific        :1738   Mean   :0.2071  
##  na  :    0   3rd Qu.:16.00   w. sou. central:1635   3rd Qu.:0.0000  
##  NA's: 6041   Max.   :20.00   w. nor. central:1294   Max.   :6.0000  
##               NA's   :44      (Other)        :4059   NA's   :101     
##     preteen           teens                   income                 region    
##  Min.   :0.0000   Min.   :0.0000   $25000 or more:9725   south atlantic :3340  
##  1st Qu.:0.0000   1st Qu.:0.0000   $20000 - 24999:1278   e. nor. central:2881  
##  Median :0.0000   Median :0.0000   $10000 - 14999:1251   middle atlantic:2414  
##  Mean   :0.2557   Mean   :0.1814   $15000 - 19999:1099   pacific        :2353  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   $8000 to 9999 : 399   w. sou. central:1782  
##  Max.   :6.0000   Max.   :7.0000   (Other)       :1293   w. nor. central:1247  
##  NA's   :101      NA's   :88       NA's          :2092   (Other)        :3120  
##               attend               happy          owngun        tvhours      
##  never           :3213   iap          :   0   iap    :7180   Min.   : 0.000  
##  every week      :3041   very happy   :5260   yes    :4117   1st Qu.: 1.000  
##  once a year     :2209   pretty happy :9791   no     :   0   Median : 2.000  
##  sevrl times a yr:2118   not too happy:2086   refused:   0   Mean   : 2.904  
##  2-3x a month    :1495   dk           :   0   dk     :   0   3rd Qu.: 4.000  
##  (Other)         :4788   na           :   0   na     :   0   Max.   :24.000  
##  NA's            : 273                        NA's   :5840   NA's   :5343    
##      vhappy         mothfath16        black           gwbush04    
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :1.000   Median :0.0000   Median :1.000  
##  Mean   :0.3069   Mean   :0.693   Mean   :0.1384   Mean   :0.502  
##  3rd Qu.:1.0000   3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.:1.000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.000  
##                   NA's   :5                        NA's   :15207  
##      female        blackfemale         gwbush00       occattend    
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000   1st Qu.:0.000  
##  Median :1.0000   Median :0.00000   Median :1.000   Median :0.000  
##  Mean   :0.5591   Mean   :0.08905   Mean   :0.522   Mean   :0.285  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:1.000   3rd Qu.:1.000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.000   Max.   :1.000  
##                                     NA's   :13701   NA's   :273    
##    regattend           y94              y96              y98        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.1312   Mean   :0.1737   Mean   :0.1683   Mean   :0.1637  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :273                                                        
##       y00             y02               y04               y06        
##  Min.   :0.000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.000   Median :0.00000   Median :0.00000   Median :0.0000  
##  Mean   :0.162   Mean   :0.07989   Mean   :0.07802   Mean   :0.1742  
##  3rd Qu.:0.000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :1.000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##                                                                      
##      unem10     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.318  
##  3rd Qu.:1.000  
##  Max.   :1.000  
##  NA's   :5796
# Absolute count of missing values in each column.
colSums(is.na(data2))
##        year    workstat    prestige     divorce     widowed        educ 
##           0           3         854        7383        6041          44 
##       reg16      babies     preteen       teens      income      region 
##           0         101         101          88        2092           0 
##      attend       happy      owngun     tvhours      vhappy  mothfath16 
##         273           0        5840        5343           0           5 
##       black    gwbush04      female blackfemale    gwbush00   occattend 
##           0       15207           0           0       13701         273 
##   regattend         y94         y96         y98         y00         y02 
##         273           0           0           0           0           0 
##         y04         y06      unem10 
##           0           0        5796

Here are the counts of missing values per column: gwbush04 has the highest number of missing values (15,207), while workstat has the fewest, with only 3.

# Proportion (rather than count) of missing values in each column; a mean
# of a logical matrix column is the fraction of TRUEs.
colMeans(is.na(data2))
##         year     workstat     prestige      divorce      widowed         educ 
## 0.0000000000 0.0001750598 0.0498336932 0.4308221976 0.3525121083 0.0025675439 
##        reg16       babies      preteen        teens       income       region 
## 0.0000000000 0.0058936803 0.0058936803 0.0051350878 0.1220750423 0.0000000000 
##       attend        happy       owngun      tvhours       vhappy   mothfath16 
## 0.0159304429 0.0000000000 0.3407831009 0.3117815254 0.0000000000 0.0002917664 
##        black     gwbush04       female  blackfemale     gwbush00    occattend 
## 0.0000000000 0.8873781875 0.0000000000 0.0000000000 0.7994981619 0.0159304429 
##    regattend          y94          y96          y98          y00          y02 
## 0.0159304429 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000 
##          y04          y06       unem10 
## 0.0000000000 0.0000000000 0.3382155570

From the above output, gwbush04 has the highest percentage of rows with missing values (~89%), while workstat has the lowest non-zero percentage (~0.02%).

# Mean-impute the numeric columns: replace every NA with the rounded column
# mean.  The original code repeated one assignment per column (and imputed
# `educ` twice by accident); a single loop over the column names removes
# the duplication without changing the result.
# NOTE(review): rounded-mean imputation is questionable for the 0/1
# indicator columns with heavy missingness (gwbush04 is ~89% missing), since
# every missing value collapses to a single 0 or 1 -- confirm this is the
# intended treatment.
mean_impute_cols <- c("prestige", "educ", "babies", "preteen", "teens",
                      "tvhours", "mothfath16", "gwbush04", "gwbush00",
                      "occattend", "regattend", "unem10")
for (col in mean_impute_cols) {
  data2[[col]][is.na(data2[[col]])] <- round(mean(data2[[col]], na.rm = TRUE))
}

Above, we replace the missing values in the numeric attributes with the rounded mean of each column.

# Return the most frequent non-NA value (statistical mode) of a vector or
# factor `x`.
#
# Fix: the original returned *all* tied modes (`ux[tab == max(tab)]`), so a
# tie produced a multi-element result that silently recycled when assigned
# into the NA slots of a column during imputation.  We now always return a
# single value -- the first mode in order of appearance.
Modes <- function(x) {
  ux <- na.omit(unique(x))       # candidate values, NA excluded
  tab <- tabulate(match(x, ux))  # frequency of each candidate
  ux[which.max(tab)]             # first candidate achieving the max count
}
# Mode-impute the categorical columns: fill each NA with the column's most
# frequent level, as computed by Modes().
mode_impute_cols <- c("workstat", "divorce", "widowed",
                      "income", "attend", "owngun")
for (nm in mode_impute_cols) {
  data2[[nm]][is.na(data2[[nm]])] <- Modes(data2[[nm]])
}

The above variables are categorical, so we replace their missing values with the mode of each column.

# Re-check missingness after imputation: every column should report zero NAs.
colSums(is.na(data2))
##        year    workstat    prestige     divorce     widowed        educ 
##           0           0           0           0           0           0 
##       reg16      babies     preteen       teens      income      region 
##           0           0           0           0           0           0 
##      attend       happy      owngun     tvhours      vhappy  mothfath16 
##           0           0           0           0           0           0 
##       black    gwbush04      female blackfemale    gwbush00   occattend 
##           0           0           0           0           0           0 
##   regattend         y94         y96         y98         y00         y02 
##           0           0           0           0           0           0 
##         y04         y06      unem10 
##           0           0           0

After replacing missing values with the mean (numeric columns) and mode (categorical columns), there are no remaining missing values in the data.

# Convert the 0/1 integer indicator columns (plus unem10) to factors so that
# downstream models treat them as categorical rather than numeric.
indicator_cols <- c("vhappy", "mothfath16", "black", "gwbush04", "female",
                    "blackfemale", "gwbush00", "occattend", "regattend",
                    "y94", "y96", "y98", "y00", "y02", "y04", "y06",
                    "unem10")
data2[indicator_cols] <- lapply(data2[indicator_cols], as.factor)

Few of the integer variables in the dataset are being converted to factor variables

# Structure after imputation and type conversion: the mean-imputed columns
# are now numeric (int -> num) and the indicator columns are 2-level factors.
str(data2)
## 'data.frame':    17137 obs. of  33 variables:
##  $ year       : int  1994 1994 1994 1994 1994 1994 1994 1994 1994 1994 ...
##  $ workstat   : Factor w/ 10 levels "iap","working fulltime",..: 8 2 2 2 3 3 2 2 2 2 ...
##  $ prestige   : num  46 22 29 42 36 43 20 44 42 46 ...
##  $ divorce    : Factor w/ 5 levels "iap","yes","no",..: 3 3 3 2 3 3 3 3 3 3 ...
##  $ widowed    : Factor w/ 5 levels "iap","yes","no",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ educ       : num  12 12 12 8 13 15 9 12 12 12 ...
##  $ reg16      : Factor w/ 10 levels "foreign","new england",..: 3 1 1 1 3 3 3 1 1 3 ...
##  $ babies     : num  2 0 0 0 0 0 0 0 2 0 ...
##  $ preteen    : num  3 0 0 0 1 0 1 0 1 0 ...
##  $ teens      : num  0 0 0 0 1 0 1 0 0 1 ...
##  $ income     : Factor w/ 16 levels "iap","lt $1000",..: 10 13 11 11 10 13 12 10 13 11 ...
##  $ region     : Factor w/ 10 levels "not assigned",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ attend     : Factor w/ 10 levels "never","lt once a year",..: 4 8 9 3 3 4 9 9 4 9 ...
##  $ happy      : Factor w/ 6 levels "iap","very happy",..: 3 2 3 4 4 3 4 2 3 3 ...
##  $ owngun     : Factor w/ 6 levels "iap","yes","no",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ tvhours    : num  2 3 1 3 3 0 10 4 2 3 ...
##  $ vhappy     : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 2 1 1 ...
##  $ mothfath16 : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 2 1 2 ...
##  $ black      : Factor w/ 2 levels "0","1": 2 2 2 1 2 2 1 2 2 2 ...
##  $ gwbush04   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ female     : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 2 1 2 2 ...
##  $ blackfemale: Factor w/ 2 levels "0","1": 2 1 2 1 2 2 1 1 2 2 ...
##  $ gwbush00   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ occattend  : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 1 2 1 ...
##  $ regattend  : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 2 2 1 2 ...
##  $ y94        : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ y96        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ y98        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ y00        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ y02        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ y04        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ y06        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ unem10     : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 1 1 2 ...
##  - attr(*, "time.stamp")= chr "22 Jan 2013 14:09"
##  - attr(*, "label.table")=List of 15
##   ..$ wrkstat : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr [1:10] "iap" "working fulltime" "working parttime" "temp not working" ...
##   ..$ prestg80: Named num 0
##   .. ..- attr(*, "names")= chr "dk,na,iap"
##   ..$ divorce : Named num [1:5] 0 1 2 8 9
##   .. ..- attr(*, "names")= chr [1:5] "iap" "yes" "no" "dk" ...
##   ..$ widowed : Named num [1:5] 0 1 2 8 9
##   .. ..- attr(*, "names")= chr [1:5] "iap" "yes" "no" "dk" ...
##   ..$ educ    : Named num [1:3] 97 98 99
##   .. ..- attr(*, "names")= chr [1:3] "iap" "dk" "na"
##   ..$ reg16   : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr [1:10] "foreign" "new england" "middle atlantic" "e. nor. central" ...
##   ..$ babies  : Named num [1:2] 8 9
##   .. ..- attr(*, "names")= chr [1:2] "8 or more" "na"
##   ..$ preteen : Named num [1:2] 8 9
##   .. ..- attr(*, "names")= chr [1:2] "8 or more" "na"
##   ..$ teens   : Named num [1:2] 8 9
##   .. ..- attr(*, "names")= chr [1:2] "8 or more" "na"
##   ..$ income  : Named num [1:16] 0 1 2 3 4 5 6 7 8 9 ...
##   .. ..- attr(*, "names")= chr [1:16] "iap" "lt $1000" "$1000 to 2999" "$3000 to 3999" ...
##   ..$ region  : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr [1:10] "not assigned" "new england" "middle atlantic" "e. nor. central" ...
##   ..$ attend  : Named num [1:10] 0 1 2 3 4 5 6 7 8 9
##   .. ..- attr(*, "names")= chr [1:10] "never" "lt once a year" "once a year" "sevrl times a yr" ...
##   ..$ happy   : Named num [1:6] 0 1 2 3 8 9
##   .. ..- attr(*, "names")= chr [1:6] "iap" "very happy" "pretty happy" "not too happy" ...
##   ..$ owngun  : Named num [1:6] 0 1 2 3 8 9
##   .. ..- attr(*, "names")= chr [1:6] "iap" "yes" "no" "refused" ...
##   ..$ tvhours : Named num [1:3] -1 98 99
##   .. ..- attr(*, "names")= chr [1:3] "iap" "dk" "na"
# The "iap", "dk" and "na" levels of happy have zero observations (see the
# summary above); drop them so models only see the three observed levels.
data2$happy<-droplevels(data2$happy)

Here we have dropped the unused levels from the happy variable, namely "iap", "dk" and "na".

library(ggplot2)

# Dodged bar charts of `happy` against each categorical predictor.
# The original file repeated one ggplot call per variable and accidentally
# skipped y96; looping over the variable names fixes that omission and
# removes the repetition.  print() is required inside a loop for the plots
# to actually render.
bar_vars <- c("workstat", "divorce", "widowed", "reg16", "income", "region",
              "attend", "owngun", "vhappy", "mothfath16", "black",
              "gwbush04", "female", "blackfemale", "gwbush00", "occattend",
              "regattend", "y94", "y96", "y98", "y00", "y02", "y04", "y06",
              "unem10")
for (v in bar_vars) {
  print(
    ggplot(data2, aes(x = happy, fill = .data[[v]])) +
      geom_bar(position = "dodge") +
      labs(fill = v)  # keep the variable name as the legend title
  )
}

Barplots are used to visualize the relationship between factor variables

# Boxplots of each numeric predictor split by happiness level
# (factor on the x-axis, numeric on the y-axis).
boxplot(data2$prestige~data2$happy)

boxplot(data2$educ~data2$happy)

boxplot(data2$babies~data2$happy)

boxplot(data2$preteen~data2$happy)

boxplot(data2$teens~data2$happy)

boxplot(data2$tvhours~data2$happy)

Boxplots are used to establish the relationship between factor and numeric variables

# Collapse "pretty happy" into "very happy": renaming a level to an existing
# level name merges the two, leaving `happy` binary ("very happy" vs
# "not too happy") for the classifiers below.
levels(data2$happy) <- sub("^pretty happy$", "very happy", levels(data2$happy))

Here we have reduced the levels of the happy variable from three to two: since “pretty happy” and “very happy” are quite similar, we merged them into one. Having two levels also makes it straightforward to compute precision and recall.

set.seed(123)
library(lattice)
library(caret)

# Stratified 70/30 train/test split on the outcome `happy`.
# The original extracted the label column by position (14), which silently
# breaks if columns are ever added or reordered; index by name instead.
train.index=createDataPartition(data2$happy, p=0.7, list = FALSE)
data2_train<-data2[train.index, ]
data2_test <-data2[-train.index, ]
data2_train_labels = data2[train.index, "happy"]
data2_test_labels = data2[-train.index, "happy"]

We have split the data to 70% training and 30% testing

#LOGISTIC REGRESSION

set.seed(1)
# Binomial logit of happy on all remaining columns.  With R's default
# factor coding the modelled "success" is the second level of happy,
# i.e. "not too happy".
# NOTE(review): vhappy appears to be an indicator version of the outcome
# itself (its coefficient explodes to -23.5 with a huge SE below, a classic
# separation symptom) -- confirm it should be a predictor at all.  The
# "4 not defined because of singularities" message shows occattend,
# regattend, y04 and y06 are linear combinations of other predictors.
data2_logistic<-glm(happy~., data=data2_train, family = "binomial")
summary(data2_logistic)
## 
## Call:
## glm(formula = happy ~ ., family = "binomial", data = data2_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5935  -0.5750  -0.4185   0.0000   2.4590  
## 
## Coefficients: (4 not defined because of singularities)
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               6.295e+01  1.357e+02   0.464 0.642855    
## year                     -3.136e-02  6.767e-02  -0.463 0.643078    
## workstatworking parttime  5.449e-02  1.030e-01   0.529 0.596799    
## workstattemp not working  6.287e-01  1.884e-01   3.337 0.000846 ***
## workstatunempl, laid off  7.380e-01  1.388e-01   5.316 1.06e-07 ***
## workstatretired           3.715e-01  9.859e-02   3.768 0.000164 ***
## workstatschool            1.357e-01  1.693e-01   0.801 0.422992    
## workstatkeeping house     2.261e-01  1.061e-01   2.131 0.033116 *  
## workstatother             9.292e-01  1.569e-01   5.923 3.16e-09 ***
## prestige                 -3.147e-03  2.711e-03  -1.161 0.245838    
## divorceno                 2.113e-01  9.960e-02   2.121 0.033884 *  
## widowedyes                2.188e-01  1.938e-01   1.129 0.258885    
## educ                     -3.478e-02  1.205e-02  -2.888 0.003883 ** 
## reg16new england         -6.236e-01  2.305e-01  -2.705 0.006828 ** 
## reg16middle atlantic     -4.396e-01  1.407e-01  -3.123 0.001788 ** 
## reg16e. nor. central     -5.076e-01  1.545e-01  -3.286 0.001015 ** 
## reg16w. nor. central     -6.049e-01  2.001e-01  -3.022 0.002508 ** 
## reg16south atlantic      -4.846e-01  1.457e-01  -3.326 0.000882 ***
## reg16e. sou. central     -1.145e-01  2.081e-01  -0.550 0.582204    
## reg16w. sou. central     -6.242e-01  1.868e-01  -3.342 0.000833 ***
## reg16mountain            -5.843e-01  2.060e-01  -2.837 0.004557 ** 
## reg16pacific             -4.731e-01  1.569e-01  -3.016 0.002563 ** 
## babies                   -1.139e-01  6.006e-02  -1.897 0.057863 .  
## preteen                   3.780e-02  4.990e-02   0.757 0.448774    
## teens                    -4.670e-02  6.324e-02  -0.738 0.460223    
## income$1000 to 2999      -2.743e-01  3.181e-01  -0.863 0.388410    
## income$3000 to 3999      -3.718e-01  3.238e-01  -1.148 0.250905    
## income$4000 to 4999      -5.608e-01  3.386e-01  -1.656 0.097672 .  
## income$5000 to 5999      -8.615e-01  3.353e-01  -2.569 0.010186 *  
## income$6000 to 6999      -3.320e-01  3.038e-01  -1.093 0.274555    
## income$7000 to 7999      -2.988e-01  2.993e-01  -0.998 0.318228    
## income$8000 to 9999      -1.862e-01  2.715e-01  -0.686 0.492697    
## income$10000 - 14999     -6.009e-01  2.494e-01  -2.410 0.015957 *  
## income$15000 - 19999     -7.859e-01  2.561e-01  -3.068 0.002153 ** 
## income$20000 - 24999     -6.312e-01  2.531e-01  -2.494 0.012642 *  
## income$25000 or more     -9.889e-01  2.385e-01  -4.146 3.39e-05 ***
## regionmiddle atlantic     2.654e-02  2.276e-01   0.117 0.907178    
## regione. nor. central    -8.047e-02  2.370e-01  -0.340 0.734226    
## regionw. nor. central    -2.073e-01  2.674e-01  -0.775 0.438149    
## regionsouth atlantic     -5.045e-02  2.224e-01  -0.227 0.820510    
## regione. sou. central    -4.419e-01  2.821e-01  -1.567 0.117230    
## regionw. sou. central     3.379e-02  2.553e-01   0.132 0.894701    
## regionmountain           -1.111e-01  2.555e-01  -0.435 0.663822    
## regionpacific             2.406e-02  2.351e-01   0.102 0.918468    
## attendlt once a year     -1.480e-01  1.149e-01  -1.288 0.197893    
## attendonce a year        -2.368e-01  1.021e-01  -2.319 0.020389 *  
## attendsevrl times a yr   -2.016e-01  1.036e-01  -1.946 0.051695 .  
## attendonce a month       -3.275e-01  1.315e-01  -2.490 0.012759 *  
## attend2-3x a month       -3.247e-01  1.238e-01  -2.623 0.008704 ** 
## attendnrly every week    -4.999e-01  1.529e-01  -3.270 0.001075 ** 
## attendevery week         -4.444e-01  1.026e-01  -4.331 1.48e-05 ***
## attendmore thn once wk   -2.028e-01  1.367e-01  -1.483 0.137999    
## owngunyes                -1.551e-01  8.067e-02  -1.923 0.054492 .  
## tvhours                   2.632e-02  1.422e-02   1.851 0.064180 .  
## vhappy1                  -2.350e+01  2.954e+03  -0.008 0.993653    
## mothfath161              -7.856e-02  6.469e-02  -1.214 0.224594    
## black1                    3.407e-01  1.322e-01   2.576 0.009985 ** 
## gwbush041                 1.356e-01  1.743e-01   0.778 0.436467    
## female1                   7.882e-02  7.080e-02   1.113 0.265597    
## blackfemale1              3.247e-02  1.605e-01   0.202 0.839710    
## gwbush001                -4.682e-02  1.301e-01  -0.360 0.718895    
## occattend1                       NA         NA      NA       NA    
## regattend1                       NA         NA      NA       NA    
## y941                     -5.385e-01  7.681e-01  -0.701 0.483241    
## y961                     -4.246e-01  6.339e-01  -0.670 0.503049    
## y981                     -3.444e-01  5.012e-01  -0.687 0.492005    
## y001                     -5.106e-01  3.708e-01  -1.377 0.168503    
## y021                     -4.190e-01  2.476e-01  -1.692 0.090599 .  
## y041                             NA         NA      NA       NA    
## y061                             NA         NA      NA       NA    
## unem101                   4.560e-01  6.886e-02   6.622 3.55e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8888.8  on 11996  degrees of freedom
## Residual deviance: 7230.3  on 11930  degrees of freedom
## AIC: 7364.3
## 
## Number of Fisher Scoring iterations: 24
# Predicted probabilities of "not too happy" on the held-out set.  The
# rank-deficiency warning below is the same singularity issue flagged above.
data2_pred<-predict(data2_logistic, data2_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Threshold the probabilities at 0.5: "1" = not too happy, "0" = very happy.
data2_new_label=factor(ifelse(data2_pred>0.5, "1","0"))
set.seed(1)
actuallabel<-data2_test$happy
# Confusion matrix: rows = predicted label, columns = actual label.
t=table(data2_new_label,actuallabel)
t
##                actuallabel
## data2_new_label very happy not too happy
##               0       4501           609
##               1         14            16

From the table we read off the true positives, true negatives, false positives and false negatives, treating "very happy" as the positive class.

# Read the confusion-matrix cells from the table `t` instead of hand-copying
# the numbers (the original hard-coded 4501/14/609/16, which goes stale if
# the seed or data change).  "very happy" -- predicted label "0" -- is
# treated as the positive class.
TP <- t["0", "very happy"]     # predicted very happy, actually very happy
FN <- t["1", "very happy"]     # missed very happy
FP <- t["0", "not too happy"]  # predicted very happy, actually not
TN <- t["1", "not too happy"]  # correctly predicted not too happy
accuracy_logistic<-(TP+TN)/(TP+TN+FP+FN)
accuracy_logistic
## [1] 0.8787938

The accuracy is 0.8787938

# Precision for the "very happy" class.  The variable was originally
# misspelled "percision_logistic"; nothing downstream referenced it, so the
# rename is safe.  The computed value is unchanged.
precision_logistic<-TP/(TP+FP)
precision_logistic
## [1] 0.8808219

The precision is 0.8808219

# Recall (sensitivity) for the "very happy" class.
recall_logistic<-TP/(TP+FN)
recall_logistic
## [1] 0.9968992

The recall is 0.9968992

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# AUC computed from the raw predicted probabilities, so it is
# threshold-independent (unlike the 0.5-cutoff metrics above).
auc(data2_test$happy, data2_pred)
## Setting levels: control = very happy, case = not too happy
## Setting direction: controls < cases
## Area under the curve: 0.7807

AUC is 0.7807

#RANDOM FOREST

set.seed(1)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Default classification forest (500 trees, default mtry).
# NOTE(review): the OOB confusion matrix below shows the forest almost never
# predicts the minority class "not too happy" (class.error 0.989) -- a
# strong class-imbalance symptom; consider class weights or resampling.
data2_rf <- randomForest(happy ~ ., data = data2_train)
data2_rf
## 
## Call:
##  randomForest(formula = happy ~ ., data = data2_train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 12.23%
## Confusion matrix:
##               very happy not too happy class.error
## very happy         10514            22 0.002088079
## not too happy       1445            16 0.989048597
set.seed(1)
# Hard class-label predictions on the held-out set, then the test-set
# confusion table (rows = predicted, columns = actual).
data2_rf_predictions=predict(data2_rf, data2_test)
table(data2_rf_predictions,data2_test$happy)
##                     
## data2_rf_predictions very happy not too happy
##        very happy          4508           623
##        not too happy          7             2
# Confusion-matrix cells hand-copied from the table above (the table itself
# was not stored in a variable), with "very happy" as the positive class.
# NOTE(review): these literals must be kept in sync with the table output if
# the seed or data change.
TP<-4508
FN<-7
FP<-623
TN<-2
accuracy_random<-(TP+TN)/(TP+TN+FP+FN)
accuracy_random
## [1] 0.8774319

The accuracy is 0.8774319

# Precision for the "very happy" class.  Renamed from the original
# misspelling "percision_random" (nothing downstream referenced it);
# the computed value is unchanged.
precision_random<-TP/(TP+FP)
precision_random
## [1] 0.8785812

The precision is 0.8785812

# Recall (sensitivity) for the "very happy" class.
recall_random<-TP/(TP+FN)
recall_random
## [1] 0.9984496

The recall is 0.9984496

library(pROC)
# NOTE(review): this feeds hard class labels (coerced to 1/2) into auc()
# rather than class probabilities, so the resulting "AUC" of ~0.5 mostly
# reflects the labels, not the forest's ranking ability; consider
# predict(data2_rf, data2_test, type = "prob")[, 2] for a probability-based
# AUC -- confirm intent.
auc(data2_test$happy, as.numeric(data2_rf_predictions))
## Setting levels: control = very happy, case = not too happy
## Setting direction: controls < cases
## Area under the curve: 0.5008

The AUC is 0.5008

#SVM

library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
# Linear-kernel SVM (C = 1 by default).
data2_svm <- ksvm(happy ~ ., data = data2_train, kernel = "vanilladot")
##  Setting default kernel parameters
data2_svm
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 4193 
## 
## Objective Function Value : -2922 
## Training error : 0.12178
# Test-set predictions and confusion table (rows = predicted, cols = actual).
# NOTE(review): the table below shows the SVM predicts ONLY the majority
# class ("not too happy" is never predicted) -- the same class-imbalance
# problem seen with the random forest.
svm_pred<-predict(data2_svm,data2_test)
table(svm_pred, data2_test$happy)
##                
## svm_pred        very happy not too happy
##   very happy          4515           625
##   not too happy          0             0
# Confusion-matrix cells hand-copied from the table above, with "very happy"
# as the positive class.  FN and TN are zero because the SVM never predicts
# "not too happy".
TP<-4515
FN<-0
FP<-625
TN<-0
accuracy_svm<-(TP+TN)/(TP+TN+FP+FN)
accuracy_svm
## [1] 0.8784047

The accuracy is 0.8784047

# Precision for the "very happy" class.  Renamed from the original
# misspelling "percision_svm" (nothing downstream referenced it); the
# computed value is unchanged.
precision_svm<-TP/(TP+FP)
precision_svm
## [1] 0.8784047

The precision is 0.8784047

# Recall is exactly 1 because the SVM never predicts the other class
# (FN = 0), not because the model is good.
recall_svm<-TP/(TP+FN)
recall_svm
## [1] 1

The recall is 1

# NOTE(review): as with the random forest, hard labels (not probabilities)
# are passed to auc(); since all predictions are one class, the AUC is
# exactly 0.5 (no discrimination).
auc(data2_test$happy, as.numeric(svm_pred))
## Setting levels: control = very happy, case = not too happy
## Setting direction: controls < cases
## Area under the curve: 0.5

The AUC is 0.5